# Notebook helper: render a button that shows/hides all code input cells.
from IPython.display import HTML
# NOTE(review): the JS relies on the classic Jupyter Notebook DOM
# ('div.input') and on jQuery ('$') being loaded — it has no effect in
# JupyterLab; confirm the target environment.
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')
# Core analysis stack.
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pickle
import os
from platform import python_version
print('python',python_version())
import matplotlib
# Global matplotlib font settings (bold, size 15) applied to every figure below.
font = {'family' : 'normal',
'weight' : 'bold',
'size' : 15}
matplotlib.rc('font', **font)
# IPython magic: render figures inline (only valid inside a notebook).
%matplotlib inline
plt.style.use('dark_background')
Feature / target input: one row per (ID, location) pair with 48 columns — the
normalized hourly presence for the 24 weekday hours (wdhr0 … wdhr23) followed
by the 24 weekend hours (wehr0 … wehr23):

      wdhr0 wdhr1 wdhr2 ... wdhr23 wehr0 wehr1 wehr2 ... wehr23
id1loc1
id1loc2
id2loc1
...
idnlocn
a. Load data
# Load the raw sightings data; the file has no header row, so supply the
# column names directly to read_csv.
column_names = ['ID', 'make', 'device', 'start_time', 'end_time',
                'lat', 'long', 'place', 'strength']
data = pd.read_csv('data.csv', header=None, names=column_names)
# Constant helper column so that groupby(...).sum() yields row counts.
data = data.assign(dummy=1)
data.shape
b. First 5 rows
# Preview the first five rows, hiding the identifying first column (ID).
data.iloc[:,1:].head()
c. Data types
# Schema/dtype overview plus basic coverage stats for the dataset.
data.info()
unique_ids = len(set(data.ID))
print(f'Number of unique IDs: {unique_ids}')
print(f'Min start time: {min(data.start_time)}\tMax start time: {max(data.start_time)}')
print(f'Min end time: {min(data.end_time)}\tMax end time: {max(data.end_time)}')
# Frequency tables: how often each 'place' / 'strength' value (and their
# combination) occurs, plus the percentage of all rows it represents.
cnt = data[['place', 'dummy']].groupby(['place']).sum().reset_index()
cnt = cnt.rename(columns={'dummy': 'count'})
cnt['percentage'] = (cnt['count'] / len(data)) * 100
cnt
cnt = data[['strength', 'dummy']].groupby(['strength']).sum().reset_index()
cnt = cnt.rename(columns={'dummy': 'count'})
cnt['percentage'] = (cnt['count'] / len(data)) * 100
cnt
cnt = data[['place', 'strength', 'dummy']].groupby(['place', 'strength']).sum().reset_index()
cnt = cnt.rename(columns={'dummy': 'count'})
cnt['percentage'] = (cnt['count'] / len(data)) * 100
cnt
# Character spans of the day ('MM-DD') and hour ('HH') fields inside the
# timestamp strings (presumably 'YYYY-MM-DD HH:MM:SS' — confirm with the
# source data).
day = (5, 10)
hour = (11, 13)
day_lo, day_hi = day
hr_lo, hr_hi = hour
data['start_day'] = data['start_time'].map(lambda ts: ts[day_lo:day_hi])
data['start_hour'] = data['start_time'].map(lambda ts: ts[hr_lo:hr_hi])
data['end_hour'] = data['end_time'].map(lambda ts: ts[hr_lo:hr_hi])
# 'HH-HH' start/end hour pair, e.g. '09-11'.
data['hour'] = data.start_hour + '-' + data.end_hour
# Aggregate to one row per (ID, day, hour-pair, location); 'dummy' sums to
# the number of raw sightings for that combination.
data_agg = data[['ID','start_day','hour','lat','long','dummy']].groupby(['ID','start_day','hour','lat','long']).sum().reset_index()
data_agg.columns = ['id','s_day','hour','lat','long','count']
print(sorted(set(data_agg.s_day)))
print(data.shape, data_agg.shape, len(set(data_agg.hour)),sorted(set(data_agg.hour)))
# Distribution of how many distinct (hour, location) records each (id, day) has.
df_count = data_agg[['id','s_day','count']].groupby(['id','s_day']).count().reset_index()
ax = df_count['count'].hist(bins=100)
ax.set_xlabel('Presence')
ax.set_ylabel('Frequency')
# Observed days:
#['07-18', '07-19', '07-20', '07-21', '07-22', '07-23', '07-24', '07-25', '07-26', '07-27', '07-28', '07-29', '07-30', '07-31', '08-01']
# Weekend ('we') vs weekday ('wd') flag.  NOTE(review): the listed dates are
# taken to be the weekends of the observation window — confirm 07-20/21 and
# 07-27/28 fall on Sat/Sun for the relevant year.
data_agg['day'] = data_agg.s_day.map(lambda x: 'we' if x in ['07-20','07-21','07-27','07-28'] else 'wd')
# Combined 'lat-long' string used as the location identifier.
data_agg['latlong'] = data_agg.lat.map(str) + '-' + data_agg.long.map(str)
data_agg.sort_values(['id','s_day','hour'],ascending=[True,True,True], inplace=True)
data_agg.shape
def process_hour(hour_str):
    """Normalize an 'HH-HH' start/end hour pair.

    Rules, applied in order:
      * equal start and end (a record within a single hour) -> end = start + 1
      * an end of '00' is rewritten as '24' (end of day, not midnight)
      * an end earlier than the start (a record spilling into the next day,
        e.g. '23-01') is clamped to '24'

    Returns the possibly-rewritten 'HH-HH' string.
    """
    start = hour_str[:2]
    end = hour_str[-2:]
    if start == end:
        end = "{0:02d}".format(int(end) + 1)
    if end == '00':
        end = '24'
    if int(end) < int(start):
        end = '24'
    return start + '-' + end
def process_more_hours(df):
    """Explode multi-hour rows into one row per covered hour.

    ``df`` must have columns ['index', 'id', 'day', 'hr', 'latlong', 'count']
    (the 'index' column comes from the caller's ``reset_index()``), where
    'hr' is an 'HH-HH' range spanning two or more hours.  Each input row is
    replaced by one row per single hour in [start, end), carrying the same
    id/day/latlong/count.

    Returns a new DataFrame with alphabetically sorted columns
    ['count', 'day', 'hr', 'id', 'latlong'] and index labels starting at
    len(df), matching the original implementation's output exactly.

    Fix: the previous version used DataFrame.append (removed in pandas 2.0)
    and grew the frame one row at a time (O(n^2)); rows are now collected
    in a list and the frame is built once.
    """
    expanded = []
    for row in range(len(df)):
        rec = df.iloc[row]
        hr_start = int(rec['hr'][:2])
        hr_end = int(rec['hr'][-2:])
        for hr in range(hr_start, hr_end):
            expanded.append({
                "count": rec['count'],
                "day": rec['day'],
                "hr": "{0:02d}-{1:02d}".format(hr, hr + 1),
                "id": rec['id'],
                "latlong": rec['latlong'],
            })
    out = pd.DataFrame(expanded, columns=["count", "day", "hr", "id", "latlong"])
    # Match the original index labels: exploded rows were appended after the
    # len(df) source rows, which were then dropped.
    out.index = range(len(df), len(df) + len(out))
    return out
# Normalize hour ranges, then split single-hour rows from multi-hour rows.
data_agg['hr'] = data_agg.hour.map(lambda x: process_hour(x))
# Width of the range in hours (end - start).
data_agg['hr_range'] = data_agg.hr.map(lambda x: int(x[-2:])-int(x[:2]))
df_1hr = data_agg[data_agg.hr_range <= 1][['id','day','hr','latlong','count']]
df_gt1hr = data_agg[data_agg.hr_range >= 2][['id','day','hr','latlong','count']].reset_index()
assert df_1hr.shape[0] + df_gt1hr.shape[0] == data_agg.shape[0]
# Explode multi-hour rows into one row per hour and recombine.
df = process_more_hours(df_gt1hr)
data_agg = pd.concat([df_1hr,df], axis=0,sort=False)
assert df_1hr.shape[0] + df.shape[0] == data_agg.shape[0]
# Re-aggregate: several source rows can now map to the same (id, day, hr, loc).
data_agg_sum = data_agg.groupby(['id', 'day', 'hr', 'latlong']).sum().reset_index()
data_agg_sum.sort_values(['id', 'day', 'hr', 'latlong'], inplace=True)
data_agg_sum.shape
print('Processed hours: {}\nTotal number of hours:{}'.format(sorted(set(data_agg_sum.hr)),len(set(data_agg_sum.hr))))
# Build the pivot column key, e.g. 'wd08_09' = weekday, 08:00-09:00.
data_agg_sum['hr'] = data_agg_sum.hr.map(lambda x: x.replace('-','_'))
data_agg_sum['wday_hr'] = data_agg_sum.day + data_agg_sum.hr
data_agg_sum.head()
a. Pivot to make the 48 weekday/weekend hour features
# Pivot to one row per (id, location) with the 48 wd/we hour buckets as
# feature columns; a missing hour means zero presence.
df_pivot = pd.pivot_table(data_agg_sum, values = 'count', index=['id','latlong'], columns = 'wday_hr').reset_index()
df_pivot = df_pivot.fillna(0)
# Sanity check: (id, latlong) pairs are unique after the pivot.
assert len(set(df_pivot.id.astype(str) + ':'+ df_pivot.latlong)) == df_pivot.shape[0]
df_pivot.shape
b. Quick stats
# Summary statistics of the pivoted hour features.
df_pivot.describe()
# The 48 expected feature columns: 24 weekday ('wd') and 24 weekend ('we')
# hourly buckets, named like 'wd08_09'.
cols = ['{}{:02d}_{:02d}'.format(prefix, h, h + 1)
        for prefix in ('wd', 'we')
        for h in range(24)]
colswd = [col for col in cols if col.startswith('wd')]
colswe = [col for col in cols if col.startswith('we')]
# Normalize each (id, location) hourly count by the id's total for that hour
# across all of its locations, giving per-hour location shares per id.
df_pivot_sum_loc = df_pivot.groupby(['id']).sum().reset_index()
# The merge suffixes per-location columns with '_x' and per-id totals with '_y'.
df_pivot_merge_sum = pd.merge(df_pivot, df_pivot_sum_loc, on='id', how='left')
orig_cols = [col for col in df_pivot_merge_sum.columns if '_x' in col]
# For each hour column, share = per-location count / per-id total.
for col in orig_cols:
    df_pivot_merge_sum[col[:len(col)-2]] = df_pivot_merge_sum[col].map(float) / df_pivot_merge_sum[col[:len(col)-2] + '_y']
# Keep only the normalized columns; 0/0 divisions become NaN -> 0.
interested_cols = [col for col in df_pivot_merge_sum.columns if ('_x' not in col) & ('_y' not in col)]
df_pivot_merge_sum = df_pivot_merge_sum[interested_cols].fillna(0.0)
# Single 'id_lat-long' key per row, replacing the separate id/latlong columns.
df_pivot_merge_sum['IDloc'] = df_pivot_merge_sum['id'].map(str) + '_' + df_pivot_merge_sum.latlong
df_pivot_merge_sum.drop(['id','latlong'],axis=1, inplace=True)
a. Quick stats
# Summary statistics of the normalized features.
df_pivot_merge_sum.describe()
# Restrict interested_cols to the 48 hour features (drop the key columns).
exclude = ['id','latlong']
for col in exclude:
    interested_cols.remove(col)
b. Final preprocessed data: show first 5 rows
df_pivot_merge_sum.head()
Assumption: human presence patterns at user locations that serve similar functions (e.g. home, work) are similar across the sample population.
from sklearn.decomposition import PCA
# Fit PCA on all 48 normalized hour features; with no n_components cap,
# all 48 principal components are kept.
features = interested_cols
x = df_pivot_merge_sum.loc[:,features].values
print('data shape: ',np.shape(x))
pca = PCA(random_state = 8888) #(n_components=3)
principalComponents = pca.fit_transform(x)
#principalDf = pd.DataFrame(data = principalComponents
#                           , columns = ['prinComp1', 'prinComp2','prinComp3'])
# For new data, just use:
# pca_x_new = pca.transform(x_new)
a. 48 PCs
principalDf = pd.DataFrame(data = principalComponents)
principalDf.head()
# Eigenvectors: one row of pca.components_ per principal component.
print('eigenvectors shape: ', np.shape(pca.components_))
# Eigenvalues and the fraction of variance each explains.
print('eigenvalues: ', pca.explained_variance_)
print('explained_variance of each eigenvalue: ',pca.explained_variance_ratio_)
Looking under the hood of PCA (in the scikit-learn source on GitHub) reveals that `PCA.fit` is a thin wrapper around `PCA._fit()`, which returns the PCA object itself to allow method chaining. The fit performs an SVD of the data matrix and sets `pca.components_` to the top `n_components` right singular vectors; the rows of this matrix are the loadings.
# Loadings (eigenvectors): how strongly each hour feature contributes to
# each principal component.  Plot components 1-4.
loadings = pca.components_
plt.figure(figsize=(10, 5))
for vec, name, shade in zip(loadings[:4],
                            ['First', 'Second', 'Third', 'Fourth'],
                            ['pink', 'lightblue', 'red', 'blue']):
    plt.plot(vec, color=shade, label=name)
# Dashed guides at 07:00/18:00 in each day half; solid white line at the
# weekday/weekend boundary (column 24).
for xpos in (7, 18, 31, 40):
    plt.axvline(x=xpos, linestyle='--', color='gray')
plt.axvline(x=24, linestyle='-', color='white')
plt.xlabel('Hour')
plt.ylabel('Loadings')
plt.legend()
# Plot components 5-8 of the loadings.
plt.figure(figsize=(10, 5))
for vec, name, shade in zip(loadings[4:8],
                            ['Fifth', 'Sixth', 'Seventh', 'Eight'],
                            ['pink', 'lightblue', 'red', 'blue']):
    plt.plot(vec, color=shade, label=name)
for xpos in (7, 18, 31, 40):
    plt.axvline(x=xpos, linestyle='--', color='gray')
plt.axvline(x=24, linestyle='-', color='white')
plt.xlabel('Hour')
plt.ylabel('Loadings')
plt.legend()
From the 7th component onwards, the pattern looks increasingly like noise...
# Zoom in on components 7 and 8 only.
plt.figure(figsize=(10, 5))
for vec, name, shade in zip(loadings[6:8], ['Seventh', 'Eight'], ['red', 'blue']):
    plt.plot(vec, color=shade, label=name)
for xpos in (7, 18, 31, 40):
    plt.axvline(x=xpos, linestyle='--', color='gray')
plt.axvline(x=24, linestyle='-', color='white')
plt.xlabel('Hour')
plt.ylabel('Loadings')
plt.legend()
# Components 2 and 3 side by side.
plt.figure(figsize=(10, 5))
for vec, name, shade in zip(loadings[1:3], ['Second', 'Third'], ['lightblue', 'red']):
    plt.plot(vec, color=shade, label=name)
for xpos in (7, 18, 31, 40):
    plt.axvline(x=xpos, linestyle='--', color='gray')
plt.axvline(x=24, linestyle='-', color='white')
plt.xlabel('Hour')
plt.ylabel('Loadings')
plt.legend()
# Component 2 suggests people working more on the weekend, while
# component 3 suggests people working more on the weekday.
# Components 4 and 6.
plt.figure(figsize=(10, 5))
for idx, name, shade in [(3, 'Fourth', 'blue'), (5, 'Sixth', 'white')]:
    plt.plot(loadings[idx], color=shade, label=name)
for xpos in (7, 18, 31, 40):
    plt.axvline(x=xpos, linestyle='--', color='gray')
plt.axvline(x=24, linestyle='-', color='white')
plt.xlabel('Hour')
plt.ylabel('Loadings')
plt.legend()
# 4 and 6 share a similar weekday pattern.
# 4: on weekends, present at home in the evening but elsewhere in the morning.
# 6: on weekends, present at home in the morning but elsewhere in the evening.
# Cross-check PCA with a manual eigen-decomposition of the covariance matrix.
mean_vec = np.mean(x, axis=0)
cov_mat = np.cov(x.T)
eig_vals, eig_vecs = np.linalg.eig(cov_mat)
# (|eigenvalue|, eigenvector) pairs, sorted from largest to smallest.
eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i]) for i in range(len(eig_vals))]
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)
# Explained variance from the eigenvalues, as percentages.
tot = sum(eig_vals)
var_exp = [(val / tot) * 100 for val in sorted(eig_vals, reverse=True)]  # individual
cum_var_exp = np.cumsum(var_exp)                                         # cumulative
# Explained variance per component (bars) with the cumulative curve overlaid.
plt.figure(figsize=(16, 8))
plt.bar(range(len(var_exp)), var_exp, alpha=0.3333, align='center',
        label='individual explained variance', color='g')
plt.step(range(len(cum_var_exp)), cum_var_exp, where='mid',
         label='cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')
plt.legend(loc='best')
plt.show()
from sklearn.cluster import KMeans
# K-means over the first six principal components.
num_clusters = 4
kmeans = KMeans(n_clusters=num_clusters, random_state=8888)
# Compute cluster centers and predict cluster indices.
X_clustered = kmeans.fit_predict(principalComponents[:, :6])
df_pivot_merge_sum['pca_cluster'] = X_clustered
print(df_pivot_merge_sum[['pca_cluster','IDloc']].groupby(['pca_cluster']).count().reset_index())
# Per-cluster feature frames for profiling.
cluster_0, cluster_1, cluster_2, cluster_3 = [
    df_pivot_merge_sum[df_pivot_merge_sum.pca_cluster == k][interested_cols]
    for k in range(4)
]
# Home hours taken as before 8 am and after 7 pm, as in most other research.
plt.figure(figsize=(10, 6))
for frame, shade, lab in [(cluster_0, 'gray', 'Others'),
                          (cluster_1, 'orange', 'Home1'),
                          (cluster_2, 'green', 'Work?'),
                          (cluster_3, 'red', 'Home2?')]:
    plt.plot(list(frame.mean()), color=shade, label=lab)
for xpos in (7, 18, 31, 40):
    plt.axvline(x=xpos, linestyle='--', color='gray')
plt.axvline(x=24, linestyle='-', color='white')
plt.xlabel('Hour')
plt.ylabel('Normalize Hourly Presences')
plt.legend()
# Repeat the clustering but drop PC1 (use components 2-6 only).
num_clusters = 4
kmeans = KMeans(n_clusters=num_clusters, random_state=8888)
# Compute cluster centers and predict cluster indices.
X_clustered = kmeans.fit_predict(principalComponents[:, 1:6])
df_pivot_merge_sum['pca_cluster'] = X_clustered
print(df_pivot_merge_sum[['pca_cluster','IDloc']].groupby(['pca_cluster']).count().reset_index())
cluster_0, cluster_1, cluster_2, cluster_3 = [
    df_pivot_merge_sum[df_pivot_merge_sum.pca_cluster == k][interested_cols]
    for k in range(4)
]
# Home hours taken as before 8 am and after 7 pm, as in most other research.
plt.figure(figsize=(10, 6))
for frame, shade, lab in [(cluster_0, 'gray', 'Others'),
                          (cluster_1, 'orange', 'Home1'),
                          (cluster_2, 'red', 'Home2'),
                          (cluster_3, 'green', 'Work')]:
    plt.plot(list(frame.mean()), color=shade, label=lab)
for xpos in (7, 18, 31, 40):
    plt.axvline(x=xpos, linestyle='--', color='gray')
plt.axvline(x=24, linestyle='-', color='white')
plt.xlabel('Hour')
plt.ylabel('Normalize Hourly Presences')
plt.legend()
# Recover id / lat / long from the combined 'IDloc' key ('id_lat-long').
df_pivot_merge_sum['id'] = df_pivot_merge_sum.IDloc.map(lambda x: x.split(sep='_')[0])
# NOTE(review): splitting latlong on '-' breaks for negative coordinates
# (e.g. '-33.8--70.6' -> float('') error) — confirm all coordinates in this
# dataset are positive.
df_pivot_merge_sum['lat'] = df_pivot_merge_sum.IDloc.map(lambda x: float(x.split(sep='_')[1].split('-')[0]))
df_pivot_merge_sum['long'] = df_pivot_merge_sum.IDloc.map(lambda x:float(x.split(sep='_')[1].split('-')[1]))
# IDs with at least one location in cluster 0 ('Others').
sid = list(set(df_pivot_merge_sum[df_pivot_merge_sum.pca_cluster==0].id))
len(sid)
# One scatter map per id: all of its locations, colored and sized by cluster
# (cluster 0 drawn small/silver so the labeled clusters stand out).
for s_id in sid:
    LABEL_COLOR_MAP = {0:['silver',10], 1: ['orange',100], 2: ['red',100], 3: ['green',100]}
    plt.figure(figsize=(10,6))
    for i in range(0,len(set(df_pivot_merge_sum.pca_cluster))):
        plt.scatter(df_pivot_merge_sum[(df_pivot_merge_sum.pca_cluster==i) & (df_pivot_merge_sum.id==s_id)]['long'],df_pivot_merge_sum[(df_pivot_merge_sum.pca_cluster==i) & (df_pivot_merge_sum.id==s_id)]['lat'],color=LABEL_COLOR_MAP[i][0],s=LABEL_COLOR_MAP[i][1])
    #plt.title('ID: ' + str(s_id))
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
PCA finds the set of orthogonal vectors that best explains the variability of the data, while ICA finds a set of statistically independent vectors that can reconstruct the data.
In short, PCA helps to compress data, and ICA helps to separate it.
from sklearn.decomposition import FastICA, PCA
# Side-by-side comparison of ICA mixing vectors and PCA eigenvectors.
NUM_COMPONENTS = 4
# Compute ICA.  Fix: random_state is now pinned to 8888 for reproducibility,
# matching every other estimator in this analysis — FastICA is otherwise
# stochastic and the plotted components would change between runs.
ica = FastICA(n_components=NUM_COMPONENTS, random_state=8888)
S_ = ica.fit_transform(x)          # Reconstructed source signals
A_ = ica.mixing_.transpose()       # Estimated mixing matrix, one row per component
# For comparison, compute PCA with the same number of components.
pca = PCA(n_components=NUM_COMPONENTS)
H = pca.fit_transform(x)           # Projection onto orthogonal components
EV_ = pca.components_
plt.figure(figsize=(12, 12))
models = [A_, EV_]
names = ["ICA", "PCA"]
for ii, (model, name) in enumerate(zip(models, names), 1):
    plt.subplot(2, 1, ii)
    plt.title(name)
    for component, num in zip(model, list(range(NUM_COMPONENTS))):
        plt.plot(component, label="Comp {0}".format(num))
    plt.legend(fancybox=True, framealpha=0.9, shadow=True, borderpad=1)
    # Dashed guides at 07:00/18:00 in each half; solid white line at the
    # weekday/weekend boundary.
    for xpos in (7, 18, 31, 40):
        plt.axvline(x=xpos, linestyle='--', color='gray')
    plt.axvline(x=24, linestyle='-', color='white')
    plt.xlabel('Hour')
    plt.ylabel('Loadings')
plt.show()
# ICA with 8 components, then k-means on the recovered sources.
NUM_COMPONENTS = 8
ica = FastICA(n_components=NUM_COMPONENTS, random_state=8888)
S_ = ica.fit_transform(x)      # Reconstructed source signals
A_ = ica.mixing_.transpose()   # Estimated mixing matrix
kmeans = KMeans(n_clusters=num_clusters, random_state=8888)
X_clustered = kmeans.fit_predict(S_)
df_pivot_merge_sum['ica_cluster'] = X_clustered
print(df_pivot_merge_sum[['ica_cluster','IDloc']].groupby(['ica_cluster']).count().reset_index())
cluster_0, cluster_1, cluster_2, cluster_3 = [
    df_pivot_merge_sum[df_pivot_merge_sum.ica_cluster == k][interested_cols]
    for k in range(4)
]
# Home hours taken as before 8 am and after 7 pm, as in most other research.
plt.figure(figsize=(10, 6))
for frame, shade, lab in [(cluster_0, 'gray', 'Others'),
                          (cluster_1, 'green', 'Work'),
                          (cluster_2, 'orange', 'Home1'),
                          (cluster_3, 'red', 'Home2')]:
    plt.plot(list(frame.mean()), color=shade, label=lab)
for xpos in (7, 18, 31, 40):
    plt.axvline(x=xpos, linestyle='--', color='gray')
plt.axvline(x=24, linestyle='-', color='white')
plt.xlabel('Hour')
plt.ylabel('Normalize Hourly Presences')
plt.legend()
def ica_cluster_name(cluster_number):
    """Human-readable label for an ICA k-means cluster id (unknown -> 'None')."""
    labels = {0: "Others", 1: "Work", 2: "Home1", 3: "Home2"}
    return labels.get(cluster_number, "None")
def pca_cluster_name(cluster_number):
    """Human-readable label for a PCA k-means cluster id (unknown -> 'None')."""
    labels = {0: "Others", 1: "Home1", 2: "Home2", 3: "Work"}
    return labels.get(cluster_number, "None")
# Unique-ID counts per ICA cluster, with readable cluster names.
cnt = df_pivot_merge_sum[['ica_cluster', 'id']].drop_duplicates()
cnt = cnt.groupby(['ica_cluster']).count().reset_index()
cnt.columns = ['ica_cluster', 'Number of unique IDs']
cnt['cluster_name'] = cnt.ica_cluster.map(ica_cluster_name)
cnt.sort_values("cluster_name")
# Count of IDs with at least one 'Home' (cluster 2 or 3) location.
'# of unique IDs for Home: ', len(set(df_pivot_merge_sum[df_pivot_merge_sum.ica_cluster.isin([2, 3])]['id']))
# Unique-ID counts per PCA cluster, with readable cluster names.
cnt = df_pivot_merge_sum[['pca_cluster', 'id']].drop_duplicates()
cnt = cnt.groupby(['pca_cluster']).count().reset_index()
cnt.columns = ['pca_cluster', 'Number of unique IDs']
cnt['cluster_name'] = cnt.pca_cluster.map(pca_cluster_name)
cnt.sort_values("cluster_name")
# Count of IDs with at least one 'Home' (cluster 1 or 2) location.
'# of unique IDs for Home: ', len(set(df_pivot_merge_sum[df_pivot_merge_sum.pca_cluster.isin([1, 2])]['id']))
TO DO: